import re
import pandas as pd
import os
import requests

root = "./数据汇总"  # 保存文件的目录：.表示当前目录下
excel_path = "data.xls"  # 导出的excel名称
all_group = "全部数据"  # 全部数据的文件夹命名
excel_name = "数据汇总"  # 生成结果数据的保存excel


# 创建组文件夹，root根目录，group文件夹名称
def mkdir(root, group):
    if not os.path.exists(root + "/" + group):
        os.makedirs(root + "/" + group)
    # else:
    # print("该文件夹已存在" + group)


# 保存图片
def save_picture(root, group, picture_name, picture_path):
    length = len(picture_path)
    for i in range(length):
        res = requests.get(picture_path[i])
        mkdir(root + "/" + group + "/", picture_name.split("-")[2] + "-" + picture_name.split("-")[3])  # 创建个人文件夹
        if length == 1:
            save_path = root + "/" + group + "/" + picture_name.split("-")[2] + "-" + picture_name.split("-")[
                3] + "/" + picture_name + ".jpg"
        else:
            save_path = root + "/" + group + "/" + picture_name.split("-")[2] + "-" + picture_name.split("-")[
                3] + "/" + picture_name + "-" + str(i) + ".jpg"
        # 若图片之前下载过则跳过保存环节
        if os.path.exists(save_path):
            print("该图片已存在" + save_path)
            continue
        with open(save_path, 'wb') as f:
            f.write(res.content)


# 将excel表中原数据的string形式变为列表形式，为了解决多个图片的情况
def get_picture_path(picture_path_data):
    picture_path = picture_path_data.split("http")  # 通过http切割
    picture_path = picture_path[1:]  # 删除第一个空元素
    length = len(picture_path)
    for i in range(length):
        picture_path[i] = "http" + picture_path[i][:-2]  # 添加回http并删除后面的换行符
    return picture_path


# 用于对群昵称有误的进行筛查和订正
def correct_person(person):
    try:
        num = re.search(r'\d{10}', person).group()
        name = re.search(r'[\u4e00-\u9fa5]+', person).group()
    except AttributeError:
        print("--------------------圈昵称错误：" + person + "--------------------")
        return person + "-错误圈昵称"
    else:
        return num + "-" + name


# 用于对班级分组的标识,表中的用户标签
def get_groupinfo(groupinfo):
    if pd.isna(float(groupinfo)):
        return all_group
    else:
        return str(int(groupinfo))


# 对公里数填写不规范进行更改，如果发现新的规则需添加在dirty列表中
# 错误数据不会报错，会在图片公里数处显示错误数据，再求和时会报错提醒
def correct_kilometre(kilometre, date):
    month = str(int(date.split("-")[0]))  # 月份
    day = str(int(date.split("-")[1]))  # 日期
    dirty_data = [" ",
                  month + "." + day,
                  month + "月" + day + "日",
                  month + "月" + day,
                  month + "." + day + "日",
                  month + "." + day + "号",
                  month + "/" + day
                  ]
    kilo_divi = [i for i in re.split("[ ,，-]", kilometre) if i != '']  # 删除空元素
    # print(kilo_divi)

    for kilo in kilo_divi:
        # 如果数据中存在公里或者km的词，则直接选取该段的数返回
        if "公里" in kilo or "km" in kilo:
            return re.search(r"[\d+\.\d]*", kilo).group()

        # 刪除脏数据
        if kilo in dirty_data:
            kilo_divi.remove(kilo)
            continue
        # 如果里面不包含数字则删除
        if not any(chr.isdigit() for chr in kilo):
            # print(any(chr.isdigit() for chr in kilo))
            kilo_divi.remove(kilo)
            continue

    if len(kilo_divi) == 1:
        if "公里" in kilo_divi[0] or "km" in kilo_divi[0]:
            return re.search(r"[\d+\.\d]*", kilo_divi[0]).group()
        # 通过转型检查里程格式是否正确
        try:
            float(kilo_divi[0])
        except ValueError:
            return "错误里程"
        else:
            return kilo_divi[0]
    else:
        return "错误里程"


# excel表中的数据读入为列表，返回值：data_list，其最后元素为分组内容
def data(excel_path):
    data_list = []  # 全部数据的列表
    group_list = []  # 分组,最终加在data_list列表的最后
    df = pd.read_excel(excel_path)
    for i in range(df.shape[0]):
        line_data = []  # 0日期 1学号-姓名 2分组信息 3公里数 4图片路径 5图片命名
        # 0日期：04-08
        date = df.values[i, 6]
        line_data.append(date)

        # 1个人信息：学号-姓名
        person = df.values[i, 0]
        person = correct_person(person)
        line_data.append(person)

        # 2班级分组信息
        groupinfo = df.values[i, 1]
        groupinfo = get_groupinfo(groupinfo)
        line_data.append(groupinfo)
        if not groupinfo in group_list:
            group_list.append(groupinfo)

        # 3公里数：5.4
        kilometre = df.values[i, 9].split("\n")[1][1:-1]
        kilometre = correct_kilometre(kilometre, date[5:10])
        line_data.append(kilometre)

        # 4图片路径
        picture_path_data = df.values[i, 12]
        picture_path = get_picture_path(picture_path_data)
        line_data.append(picture_path)

        # 5图片命名
        picture_name = date[5:10] + "-" + person + "-" + kilometre
        line_data.append(picture_name)

        data_list.append(line_data)

    data_list.append(group_list)
    return data_list


# 下载数据
def download(root, group, data_list):
    count = 1
    for pic in data_list:
        print("\r完成进度{0}%   ".format(round(count * 100 / len(data_list), 2)), end="", flush=True)
        if group == all_group:
            save_picture(root, "全部数据", pic[5], pic[4])
        else:
            if pic[2] == group:
                save_picture(root, group, pic[5], pic[4])
        count += 1


# 被save_sum调用
# 计算分组文件夹下的个人总里程和分组总里程，并更改将结果加到文件夹命名上
# 返回该分组下成员的信息列表和分组总里程数
def sum_data(root, group):
    person_list = [["学号", "姓名", "总里程(km)"]]
    first_pass = 0  # os.walk返回的第一个路径是总路径，这里需要跳过
    group_sum = 0  # 分组计数总和
    for dir, folder, file in os.walk(root + "/" + group):
        person_sum = 0  # 个人计数
        if first_pass == 0:
            first_pass += 1
            continue

        for f in file:
            try:
                # 分情况是因为一人传多张照片命名会多个图片计数
                if len(f.split('-')) == 5:
                    group_sum += float(f.split("-")[-1][:-4])
                    person_sum += float(f.split("-")[-1][:-4])
                elif len(f.split('-')) == 6:
                    if f.split("-")[-1][:-4] == "0":
                        group_sum += float(f.split("-")[-2])
                        person_sum += float(f.split("-")[-2])
            except ValueError:
                print("--------------------里程错误：" + f + "--------------------")
        person_list.append([file[0].split("-")[2], file[0].split("-")[3], round(person_sum, 2)])
        os.rename(dir, dir + "-" + str(round(person_sum, 2)))
    os.rename(root + "/" + group, root + "/" + group + "-" + str(round(group_sum, 2)))
    return person_list, round(group_sum, 2)


# 保存数据到excel,其中直接调用sum_data
def save_sum(group_list, excel_name):
    excel_writer = pd.ExcelWriter(root + "/" + excel_name + ".xlsx")

    for group in group_list:
        data = sum_data(root, group)
        df = pd.DataFrame(data[0])
        df.to_excel(excel_writer, sheet_name=group + "-" + str(data[1]), header=False, index=False)
    excel_writer.save()
    excel_writer.close()


data = data(excel_path)
data_list = data[:-1]  # 所有数据按行为列表一个元素
group_list = data[-1]  # 所有分组信息

# 下载全部分组，下载过程中会报告数据有误，请更改后再次运行
for group in group_list:
    print("\n正在下载:" + group + "组")
    download(root, group, data_list)

# 一定要确保一次运行成功，不然会出现匹配不到文件夹的情况或者文件夹重复的情况，懒得写异常处理了
# （↑当然可以每次运行换一个root路径）
# 请确保数据全部正确后再执行，如果运行其报错，请删除下载内容，重新下载
# 请确保运行前关闭excel表
save_sum(group_list, excel_name)
